import warnings
warnings.filterwarnings("ignore")
import os
import pandas as pd
import numpy as np
import plotly.express as px
import missingno as msno
#################################################################
##### Setting pandas display options for pretty print
#################################################################
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3)
pd.set_option('display.max_colwidth', 50)
# COVID data URL picked from https://github.com/owid/covid-19-data/tree/master/public/data
COVID_DATA_URL = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
COVID_DATA_FILE_NAME = "owid-covid-data.csv"
COVID_DATA_DICTIONARY_URL = "https://covid.ourworldindata.org/data/owid-covid-codebook.csv"
COVID_DATA_DICTIONARY_FILE_NAME = "owid-covid-codebook.csv"
RESULTS_PATH = "EDS_Covid19_Exp_results"
DATA_FOLDER = "data"
covid_data_full_file_path = os.path.join(RESULTS_PATH,
DATA_FOLDER,
COVID_DATA_FILE_NAME)
covid_data_dictionary_full_file_path = os.path.join(RESULTS_PATH,
DATA_FOLDER,
COVID_DATA_DICTIONARY_FILE_NAME)
# Creating experiment folders as required
if not os.path.isdir(RESULTS_PATH):
os.makedirs(RESULTS_PATH)
if not os.path.isdir(os.path.join(RESULTS_PATH,
DATA_FOLDER)):
os.makedirs(os.path.join(RESULTS_PATH,
DATA_FOLDER))
#################################################################
##### Covid data
#################################################################
if os.path.isfile(covid_data_full_file_path):
print("Local data file available:", covid_data_full_file_path)
df_full_covid_data = pd.read_csv(covid_data_full_file_path,
sep=",",
header=0)
print("Data load complete.")
else:
print("No local data file found. Downloading from URL:", COVID_DATA_URL)
df_full_covid_data = pd.read_csv(COVID_DATA_URL,
sep=",",
header=0)
df_full_covid_data.to_csv(covid_data_full_file_path,
sep=",",
header=True,
index=False)
print("Data downloaded and saved to:", covid_data_full_file_path)
#################################################################
##### Covid data dictionary
#################################################################
if os.path.isfile(covid_data_dictionary_full_file_path):
print("\nLocal dictionary file available:", covid_data_dictionary_full_file_path)
df_covid_data_dict = pd.read_csv(covid_data_dictionary_full_file_path,
sep=",",
header=0)
print("Dictionary load complete.")
else:
print("\nNo local dictionary file found. Downloading from URL:", COVID_DATA_DICTIONARY_URL)
df_covid_data_dict = pd.read_csv(COVID_DATA_DICTIONARY_URL,
sep=",",
header=0)
df_covid_data_dict.to_csv(covid_data_dictionary_full_file_path,
sep=",",
header=True,
index=False)
print("Dictionary downloaded and saved to:", covid_data_dictionary_full_file_path)
Local data file available: EDS_Covid19_Exp_results\data\owid-covid-data.csv Data load complete. Local dictionary file available: EDS_Covid19_Exp_results\data\owid-covid-codebook.csv Dictionary load complete.
#################################################################
##### codebook - contains details of every column specified by data provider
#################################################################
df_covid_data_dict[['column', 'description']]
| column | description | |
|---|---|---|
| 0 | iso_code | ISO 3166-1 alpha-3 – three-letter country codes |
| 1 | continent | Continent of the geographical location |
| 2 | location | Geographical location |
| 3 | date | Date of observation |
| 4 | total_cases | Total confirmed cases of COVID-19. Counts can ... |
| 5 | new_cases | New confirmed cases of COVID-19. Counts can in... |
| 6 | new_cases_smoothed | New confirmed cases of COVID-19 (7-day smoothe... |
| 7 | total_deaths | Total deaths attributed to COVID-19. Counts ca... |
| 8 | new_deaths | New deaths attributed to COVID-19. Counts can ... |
| 9 | new_deaths_smoothed | New deaths attributed to COVID-19 (7-day smoot... |
| 10 | total_cases_per_million | Total confirmed cases of COVID-19 per 1,000,00... |
| 11 | new_cases_per_million | New confirmed cases of COVID-19 per 1,000,000 ... |
| 12 | new_cases_smoothed_per_million | New confirmed cases of COVID-19 (7-day smoothe... |
| 13 | total_deaths_per_million | Total deaths attributed to COVID-19 per 1,000,... |
| 14 | new_deaths_per_million | New deaths attributed to COVID-19 per 1,000,00... |
| 15 | new_deaths_smoothed_per_million | New deaths attributed to COVID-19 (7-day smoot... |
| 16 | reproduction_rate | Real-time estimate of the effective reproducti... |
| 17 | icu_patients | Number of COVID-19 patients in intensive care ... |
| 18 | icu_patients_per_million | Number of COVID-19 patients in intensive care ... |
| 19 | hosp_patients | Number of COVID-19 patients in hospital on a g... |
| 20 | hosp_patients_per_million | Number of COVID-19 patients in hospital on a g... |
| 21 | weekly_icu_admissions | Number of COVID-19 patients newly admitted to ... |
| 22 | weekly_icu_admissions_per_million | Number of COVID-19 patients newly admitted to ... |
| 23 | weekly_hosp_admissions | Number of COVID-19 patients newly admitted to ... |
| 24 | weekly_hosp_admissions_per_million | Number of COVID-19 patients newly admitted to ... |
| 25 | total_tests | Total tests for COVID-19 |
| 26 | new_tests | New tests for COVID-19 (only calculated for co... |
| 27 | total_tests_per_thousand | Total tests for COVID-19 per 1,000 people |
| 28 | new_tests_per_thousand | New tests for COVID-19 per 1,000 people |
| 29 | new_tests_smoothed | New tests for COVID-19 (7-day smoothed). For c... |
| 30 | new_tests_smoothed_per_thousand | New tests for COVID-19 (7-day smoothed) per 1,... |
| 31 | positive_rate | The share of COVID-19 tests that are positive,... |
| 32 | tests_per_case | Tests conducted per new confirmed case of COVI... |
| 33 | tests_units | Units used by the location to report its testi... |
| 34 | total_vaccinations | Total number of COVID-19 vaccination doses adm... |
| 35 | people_vaccinated | Total number of people who received at least o... |
| 36 | people_fully_vaccinated | Total number of people who received all doses ... |
| 37 | total_boosters | Total number of COVID-19 vaccination booster d... |
| 38 | new_vaccinations | New COVID-19 vaccination doses administered (o... |
| 39 | new_vaccinations_smoothed | New COVID-19 vaccination doses administered (7... |
| 40 | total_vaccinations_per_hundred | Total number of COVID-19 vaccination doses adm... |
| 41 | people_vaccinated_per_hundred | Total number of people who received at least o... |
| 42 | people_fully_vaccinated_per_hundred | Total number of people who received all doses ... |
| 43 | total_boosters_per_hundred | Total number of COVID-19 vaccination booster d... |
| 44 | new_vaccinations_smoothed_per_million | New COVID-19 vaccination doses administered (7... |
| 45 | new_people_vaccinated_smoothed | Daily number of people receiving their first v... |
| 46 | new_people_vaccinated_smoothed_per_hundred | Daily number of people receiving their first v... |
| 47 | stringency_index | Government Response Stringency Index: composit... |
| 48 | population | Population (latest available values). See http... |
| 49 | population_density | Number of people divided by land area, measure... |
| 50 | median_age | Median age of the population, UN projection fo... |
| 51 | aged_65_older | Share of the population that is 65 years and o... |
| 52 | aged_70_older | Share of the population that is 70 years and o... |
| 53 | gdp_per_capita | Gross domestic product at purchasing power par... |
| 54 | extreme_poverty | Share of the population living in extreme pove... |
| 55 | cardiovasc_death_rate | Death rate from cardiovascular disease in 2017... |
| 56 | diabetes_prevalence | Diabetes prevalence (% of population aged 20 t... |
| 57 | female_smokers | Share of women who smoke, most recent year ava... |
| 58 | male_smokers | Share of men who smoke, most recent year avail... |
| 59 | handwashing_facilities | Share of the population with basic handwashing... |
| 60 | hospital_beds_per_thousand | Hospital beds per 1,000 people, most recent ye... |
| 61 | life_expectancy | Life expectancy at birth in 2019 |
| 62 | human_development_index | A composite index measuring average achievemen... |
| 63 | excess_mortality | Percentage difference between the reported num... |
| 64 | excess_mortality_cumulative | Percentage difference between the cumulative n... |
| 65 | excess_mortality_cumulative_absolute | Cumulative difference between the reported num... |
| 66 | excess_mortality_cumulative_per_million | Cumulative difference between the reported num... |
df_full_covid_data.shape
(197003, 67)
# Required fields for Delivery 1 and 2
required_fields = ['location', 'date',
'total_cases', 'population',
'people_vaccinated', 'people_fully_vaccinated', 'total_boosters']
print('Non-country locations:\n', df_full_covid_data[df_full_covid_data['continent'].isnull()]['location'].unique())
Non-country locations: ['Africa' 'Asia' 'Europe' 'European Union' 'High income' 'International' 'Low income' 'Lower middle income' 'North America' 'Oceania' 'South America' 'Upper middle income' 'World']
df_full_covid_data = df_full_covid_data[df_full_covid_data['continent'].notnull()].reset_index(drop=True)
msno.bar(df_full_covid_data)
<AxesSubplot:>
msno.bar(df_full_covid_data[required_fields])
<AxesSubplot:>
msno.matrix(df_full_covid_data)
<AxesSubplot:>
msno.matrix(df_full_covid_data[required_fields])
<AxesSubplot:>
msno.dendrogram(df_full_covid_data)
<AxesSubplot:>
msno.dendrogram(df_full_covid_data[required_fields])
<AxesSubplot:>
df_missing_vaccination_by_country = df_full_covid_data[[required_fields[i] for i in [0, -3, -2, -1]]].drop('location',
axis=1).isna().groupby(df_full_covid_data.location).sum().reset_index()
df_missing_vaccination_by_country['location_records'] = df_full_covid_data['location'].value_counts().reset_index().sort_values(by='index')['location'].reset_index(drop=True)
df_missing_vaccination_by_country['people_vaccinated'] = df_missing_vaccination_by_country['people_vaccinated']/df_missing_vaccination_by_country['location_records']
df_missing_vaccination_by_country['people_fully_vaccinated'] = df_missing_vaccination_by_country['people_fully_vaccinated']/df_missing_vaccination_by_country['location_records']
df_missing_vaccination_by_country['total_boosters'] = df_missing_vaccination_by_country['total_boosters']/df_missing_vaccination_by_country['location_records']
df_missing_vaccination_by_country['mean_total_missing'] = (df_missing_vaccination_by_country['people_vaccinated'] + df_missing_vaccination_by_country['people_fully_vaccinated'] + df_missing_vaccination_by_country['total_boosters']) / 3
fig = px.bar(df_missing_vaccination_by_country, x='location', y='mean_total_missing')
fig.show()
df_missing_vaccination_by_country.sort_values(by='mean_total_missing', ascending=True).head(10)
| location | people_vaccinated | people_fully_vaccinated | total_boosters | location_records | mean_total_missing | |
|---|---|---|---|---|---|---|
| 111 | Latvia | 0.349 | 0.365 | 0.372 | 852 | 0.362 |
| 198 | Switzerland | 0.366 | 0.366 | 0.377 | 853 | 0.370 |
| 218 | United States | 0.373 | 0.373 | 0.400 | 887 | 0.382 |
| 63 | Estonia | 0.382 | 0.382 | 0.392 | 875 | 0.385 |
| 117 | Lithuania | 0.364 | 0.385 | 0.426 | 849 | 0.392 |
| 153 | Norway | 0.342 | 0.374 | 0.471 | 853 | 0.396 |
| 70 | France | 0.385 | 0.386 | 0.419 | 885 | 0.397 |
| 54 | Denmark | 0.365 | 0.381 | 0.445 | 876 | 0.397 |
| 7 | Argentina | 0.400 | 0.400 | 0.400 | 908 | 0.400 |
| 35 | Canada | 0.418 | 0.388 | 0.438 | 886 | 0.415 |
country_list = list(df_missing_vaccination_by_country.sort_values(by='mean_total_missing', ascending=True)['location'].head(3))
df_3country_covid_data = df_full_covid_data[df_full_covid_data['location'].isin(country_list)][required_fields].reset_index(drop=True)
impute_column_list = ['total_cases', 'population',
'people_vaccinated', 'people_fully_vaccinated', 'total_boosters']
msno.matrix(df_3country_covid_data)
<AxesSubplot:>
df_3country_covid_data_imputed = None
for current_country in list(df_3country_covid_data['location'].unique()):
temp_df_current_country_data = df_3country_covid_data[df_3country_covid_data['location'] == current_country]
temp_df_current_country_data = temp_df_current_country_data.sort_values(by='date', ascending=True)
for col in impute_column_list:
temp_df_current_country_data[col] = temp_df_current_country_data[col].interpolate(limit_direction="both")
if df_3country_covid_data_imputed is None:
df_3country_covid_data_imputed = temp_df_current_country_data
else:
df_3country_covid_data_imputed = pd.concat([df_3country_covid_data_imputed, temp_df_current_country_data],
ignore_index=True
)
msno.matrix(df_3country_covid_data_imputed)
<AxesSubplot:>
df_3country_covid_data_imputed['infector_percentage'] = df_3country_covid_data_imputed['total_cases']/df_3country_covid_data_imputed['population']
fig = px.line(df_3country_covid_data_imputed,
x='date', y='infector_percentage',
color='location',
# title = 'The relative cases overtime of Covid infectors'
)
fig.update_layout(
yaxis_title="Total cases as percentage of population",
xaxis_title="Timeline",
title = {
'text':'The total affected rate (percentage of the population) over time',
'x':0.45,
'xanchor': 'center',
'yanchor': 'top'
},
legend = {
'title':{'text':'Location'}
}
)
fig.show(renderer='notebook')
df_3country_covid_data_imputed['vaccination_dose1_percentage'] = df_3country_covid_data_imputed['people_vaccinated']/df_3country_covid_data_imputed['population']
df_3country_covid_data_imputed['vaccination_dose2_percentage'] = df_3country_covid_data_imputed['people_fully_vaccinated']/df_3country_covid_data_imputed['population']
df_3country_covid_data_imputed['vaccination_dose3_percentage'] = df_3country_covid_data_imputed['total_boosters']/df_3country_covid_data_imputed['population']
df_3country_covid_data_imputed_Transformed = None
for dose in range(1,4):
col_list = ['location', 'date', 'vaccination_dose{}_percentage'.format(dose)]
temp_df = df_3country_covid_data_imputed[col_list]
temp_df['location'] = temp_df['location']+', '+str(dose)
temp_df.columns = ['location, dose', 'date', 'vaccination_percentage']
if df_3country_covid_data_imputed_Transformed is None:
df_3country_covid_data_imputed_Transformed = temp_df
else:
df_3country_covid_data_imputed_Transformed = pd.concat([df_3country_covid_data_imputed_Transformed, temp_df],#
ignore_index=True
)
fig = None
fig = px.line(df_3country_covid_data_imputed_Transformed,
x='date', y='vaccination_percentage',
color='location, dose',
)
fig.update_layout(
yaxis_title="Percentage of population vaccinated",
xaxis_title="Timeline",
title = {
'text':'The vaccination rate (percentage of the population) over time',
'x':0.45,
'xanchor': 'center',
'yanchor': 'top'
},
legend = {
'title':{'text':'Location, Dosage'}
}
)
fig.show(renderer='notebook')